# Install the libraries (if using binder)
#!pip install numpy
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn
#!pip install pylab
#!pip install plotly
#!pip install geopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as plot
import plotly.graph_objects as go
from geopy.geocoders import Nominatim
# Geocoder client; used further down to look up city coordinates by name.
geolocator = Nominatim(user_agent="my_user_agent")
# Load the pollution dataset from a CSV file in the working directory.
df = pd.read_csv("pollution_us_2000_2016.csv")
# Report the size of the raw table: row count and column count.
print(f'The dataframe has {len(df)} rows and {df.shape[1]} columns')
Based on the above numbers, one concludes that the dataframe corresponds to a very large dataset (Big Data). It is therefore not advisable to view the entire dataframe, as that would be a very memory-expensive task. One needs to perform the analytics in a smarter way, so as to get the desired results without putting too much load on the memory.
# Distinct values of the 'State' column, in order of first appearance.
all_states = df['State'].unique()
num_states = len(all_states)
print(f'There are {num_states} states in the dataset.')
# List the column labels so the irrelevant ones can be identified.
print(f'The columns in the dataframe are given by \n{df.columns}')
One would now like to get rid of the columns that are not going to be useful for our analytics.
# Columns that are not used anywhere in the analysis below.
drop_cols = [
    'Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address',
    'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI',
    'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI',
    'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI',
    'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI',
]
# Remove them from df in place, then report the new shape.
df.drop(columns=drop_cols, inplace=True)
print(f'The dataframe now contains {len(df)} rows and {df.shape[1]} columns')
To get some idea about the entries in the dataframe, instead of looking at the entire dataframe, one peeks at only a few rows.
df.head()
By peeking at the above tiny dataframe, one sees that the units are not uniform across the gases. Before converting the units, one must make sure that all the entries in a particular units column are the same, i.e., for a gas x, one would like to make sure that it is always measured in the same units.
def same_units(df, col):
    """Tell whether a column of the dataframe holds exactly one distinct value.

    In this project it is used to confirm that a units column always reports
    the same unit.

    Args:
        df (dataframe): the dataframe whose column is inspected.
        col (str): name of the column to inspect.

    Returns:
        True if the column has exactly one distinct entry, else False.
    """
    distinct_values = df[col].unique()
    return len(distinct_values) == 1
def get_units(df, col):
    """Return the distinct entries found in column `col` of `df`.

    Args:
        df (dataframe): the dataframe holding the units column.
        col (str): name of the units column.

    Returns:
        Array of the distinct values, in order of first appearance.
    """
    unit_column = df[col]
    return unit_column.unique()
# For each gas: is a single unit used throughout, and which unit(s) appear?
# NO2
same_units(df, 'NO2 Units')
get_units(df, 'NO2 Units')
# SO2
same_units(df, 'SO2 Units')
get_units(df, 'SO2 Units')
# O3
same_units(df, 'O3 Units')
get_units(df, 'O3 Units')
# CO
same_units(df, 'CO Units')
get_units(df, 'CO Units')
Note that, $NO_2$ and $SO_2$ are always measured in parts per billion (ppb), while $O_3$ and $CO$ are always measured in parts per million (ppm).
A standard unit in the field of science is parts per million (ppm), hence one would like to convert $NO_2$ and $SO_2$ into parts per million (ppm).
# Rescale the two ppb-valued columns to ppm (divide by 1000).
for ppb_col in ('NO2 Mean', 'SO2 Mean'):
    df[ppb_col] = df[ppb_col] / 1000
The columns describing the units are now irrelevant; hence, one would like to discard all of these columns to get an even leaner dataframe.
# The unit labels are no longer needed once every gas is expressed in ppm.
unit_cols = [
    'NO2 Units',
    'SO2 Units',
    'O3 Units',
    'CO Units',
]
df.drop(columns=unit_cols, inplace=True)
Before moving ahead, it is good practice to check whether there are any NaN (i.e., missing) values in the dataframe.
df.isna().any()
Therefore, none of the above columns contain any missing value. Great!!
To see the trends with time, it is a good practice to have the date column as the index of the dataframe.
# Give the date column a shorter name before it becomes the index.
df.rename(columns={'Date Local':'Date'}, inplace=True)
# first check the type of the Date column
df['Date'].dtypes
One would like to convert this into a 'DateTime Object' since that makes it easier to parse dates and do analysis on it.
# Parse the date column into datetime values.
df['Date'] = pd.to_datetime(df['Date'])
# Make 'Date' the index of the dataframe; the per-year selections further
# down (df.loc[<year string>]) rely on this datetime index.
df.set_index('Date', inplace=True)
# Peek at the last few rows of the re-indexed dataframe.
df.tail()
In order to quantify $NO_2$, $O_3$, $SO_2$ and $CO$ as one single entity for each state, one would like to have a Total Mean column that allows us to do exactly that. The entries of this column will act as a measure for each state and will help us see how each of these states has performed through time.
# Row-wise sum of the four gas means (all expressed in ppm at this point).
# Plain `+` propagates NaN: a row missing any one gas gets a NaN total.
df['Total Mean'] = df['NO2 Mean'] + df['SO2 Mean'] + df['CO Mean'] + df['O3 Mean']
df.head()
The goal now is to create individual dataframes for each of the gases, with columns representing each of the states. One would also like to discard the states that have missing value for any of the years.
Note that the original dataset has 47 states (as computed before). It might be the case that not all of them have values recorded for all the years from 2000 to 2016. We would like to get rid of those states.
def get_df(gas_name, data=None, states=None, years=range(2000, 2017)):
    """
    Computes a dataframe for a particular gas with rows as years and columns as states.

    The yearly averages are computed with a single groupby over (year, state)
    instead of filtering the whole dataframe once per year and per state.

    Args:
        gas_name (str): can be any of the NO2 Mean, SO2 Mean, O3 Mean, CO Mean or Total Mean.
        data (dataframe, optional): dataframe with a datetime index and a 'State'
            column. Defaults to the module-level `df`.
        states (iterable of str, optional): states to report, in the desired
            column order. Defaults to the module-level `all_states`.
        years (iterable of int, optional): years to report, in the desired row
            order. Defaults to 2000 through 2016 inclusive.

    Returns:
        result (dataframe): rows are years, columns are states and an entry corresponds
        to the average of all the values of the gas for the corresponding year and state.
        States that lack a value for any of the requested years are dropped.
    """
    source = df if data is None else data
    state_order = all_states if states is None else states
    year_list = list(years)

    # one pass over the data: mean of the gas per (year, state), pivoted so
    # that years are rows and states are columns
    yearly_means = (
        source.groupby([source.index.year, 'State'])[gas_name]
        .mean()
        .unstack('State')
    )

    # force the requested years/states; missing combinations become NaN
    result = yearly_means.reindex(index=year_list, columns=list(state_order))

    # clear the axis names so the table looks like a plain year-by-state frame
    # (pd.melt names its variable column after columns.name when one is set)
    result = result.rename_axis(index=None, columns=None)

    # keep only the states that have a value for every requested year
    return result.dropna(axis=1)
# One year-by-state table per gas, plus one for the combined 'Total Mean'
# (built in the order: NO2, SO2, CO, O3, Total).
no2_df, so2_df, co_df, o3_df, total_df = (
    get_df(gas_column)
    for gas_column in ('NO2 Mean', 'SO2 Mean', 'CO Mean', 'O3 Mean', 'Total Mean')
)
Let us check one of the dataframes to see how it looks:
no2_df
print(f'There are {len(no2_df.columns)} states in the datframe corresponding to the nitrogen dioxide emissions.')
Naturally, one would like to know if all the dataframes have same states.
# Compare the state columns of consecutive tables. Index.equals gives one
# True/False per pair and also works when the tables have different numbers
# of states (an elementwise `==` raises ValueError on a length mismatch).
print(total_df.columns.equals(no2_df.columns))
print(no2_df.columns.equals(o3_df.columns))
print(o3_df.columns.equals(so2_df.columns))
print(so2_df.columns.equals(co_df.columns))
Therefore, only these 14 states have their data recorded for every year from 2000 to 2016. The remaining states have missing data and are hence discarded from our analysis.
Let us plot individual line plots for each of the gases to visualize how each of the 14 states has performed from 2000 to 2016.
# Font and legend sizes shared by all the line plots below.
params = {
    'legend.fontsize': 23,
    'legend.handlelength': 2,
    'font.size': 27,
}
plot.rcParams.update(params)

# Setting the style of the Seaborn line Plots
sns.set_style("whitegrid")

# One colour and one line style per state column (14 of each).
plot_colors = [
    '#ff0000', '#940606', '#5c1414', '#ff8c00', '#965709', '#b59509', '#91790c',
    '#84910c', '#c1db00', '#37db00', '#216e08', '#00ffd9', '#033b6b', '#0089ff',
]
plot_line_styles = [
    '-', '--', '-.', '-.', '--', '-', '--',
    '-.', '-.', '--', '--', '-.', '-.', '-',
]

# (table, figure title, y-axis label) for each line plot, in drawing order.
trend_specs = [
    (so2_df, "Sulphur Dioxide Trends from 2000 to 2016", "SO2 (ppm)"),
    (no2_df, "Nitrogen Dioxide Trends from 2000 to 2016", "NO2 (ppm)"),
    (o3_df, "Ozone Trends from 2000 to 2016", "O3 (ppm)"),
    (co_df, "Carbon Monoxide Trends from 2000 to 2016", "CO (ppm)"),
    (total_df, "Total Gas Trends from 2000 to 2016", "Total Mean (ppm)"),
]
for trend_frame, trend_title, trend_ylabel in trend_specs:
    trend_frame.plot(
        figsize=(40, 25),
        color=plot_colors,
        linewidth=3.5,
        style=plot_line_styles,
    )
    plt.title(trend_title)
    plt.xlabel("Year")
    plt.ylabel(trend_ylabel)
One would now like to know which of the states have the largest spread (standard deviation) in their values from 2000 to 2016. In order to accomplish this, the states are paired with the standard deviation of their values and sorted in ascending order of that standard deviation.
def get_std_dict(df):
    """
    Pairs every state with the standard deviation of its yearly values.

    Args:
        df (dataframe): one of the gas dataframes computed using get_df(gas_name);
            each column is a state.

    Returns:
        list of (state, standard deviation) tuples, sorted in ascending order
        of standard deviation.
    """
    spread_by_state = {}
    for state in df.columns:
        spread_by_state[state] = df[state].std()
    return sorted(spread_by_state.items(), key=lambda pair: pair[1])
get_std_dict(total_df)
From the above dictionary, one can infer that:
get_std_dict(o3_df)
From the above dictionary, one can infer that:
get_std_dict(no2_df)
From the above dictionary, one can infer that:
get_std_dict(so2_df)
From the above dictionary, one can infer that:
get_std_dict(co_df)
From the above dictionary, one can infer that:
One of the main goals of this project/notebook is to be able to visualize the data and make inferences from it. The above spreads (standard deviations) can be visualized using boxplots.
sns.set_theme(style="ticks")


def _boxplot_by_state(frame, xlabel):
    """Draw one horizontal box per state for the yearly values in `frame`.

    Args:
        frame (dataframe): year-by-state table of one gas (columns are states).
        xlabel (str): label for the value axis.

    Returns:
        (figure, axes) of the new plot.
    """
    # Initialize the figure
    fig, axes = plt.subplots(figsize=(9, 7))
    # Long format: one row per (state, yearly value). The column names are
    # spelled out so they do not depend on the frame's axis names.
    long_frame = pd.melt(frame, var_name="variable", value_name="value")
    # Whiskers span the full range of values (0th to 100th percentile)
    sns.boxplot(x="value", y="variable", data=long_frame,
                whis=[0, 100], width=.6, palette="husl", ax=axes)
    # Tweak the visual presentation
    axes.xaxis.grid(True)
    axes.set(ylabel="State", xlabel=xlabel)
    sns.despine(trim=True, left=True)
    return fig, axes


# One box plot per gas; the last one covers the combined total and is
# labelled as such (it used to be labelled "CO Levels").
for box_frame, box_xlabel in (
    (no2_df, "NO2 Levels"),
    (so2_df, "SO2 Levels"),
    (co_df, "CO Levels"),
    (o3_df, "O3 Levels"),
    (total_df, "Total Levels"),
):
    f, ax = _boxplot_by_state(box_frame, box_xlabel)
The goal of this section is to present a visualization of the yearly emission of gases for each of the states. The visualizations presented in this section are more interactive than those in the previous sections, as one can zoom in/out and pan the graphs. The graphs created in this section are geographic graphs where latitude and longitude coordinates have been taken into account. The graphs are also saved in HTML format. This is because getting the latitudes and longitudes is computationally very expensive. Hence, if one is just interested in the output or doesn't have a fast enough processor to run the code, the HTML files will still be accessible.
def get_city_df(state, df):
    """
    Produces a dataframe for a particular state. The entries for all the cities are averaged so as to have
    one entry/measurement per city.

    The averaging is done with a single groupby; the earlier row-by-row
    DataFrame.append is no longer available in pandas 2.x.

    Args:
        state (str): name of the state, written into the 'State' column of every row.
        df (dataframe): the rows of a single state, with a 'City' column and the gas mean columns.

    Returns:
        sample_df (dataframe): one row per city, in order of first appearance, with columns
        'City', 'NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'Total Mean' and 'State'.
        Rows whose 'City' is missing are left out.
    """
    gas_cols = ['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'Total Mean']
    # sort=False keeps the cities in the order in which they first appear
    sample_df = (
        df.groupby('City', sort=False)[gas_cols]
        .mean()
        .reset_index()
    )
    sample_df['State'] = state
    return sample_df
states = list(no2_df.columns)
def year_df(year):
    """
    Builds one dataframe covering every state and city for a particular year.

    For each state in the module-level `states` list, the rows of the
    module-level `df` for that state and year are passed to
    get_city_df(state, df), and the per-state results are stacked.

    Args:
        year (str): any value from 2000 to 2016.

    Returns:
        final_df (dataframe): the per-city gas means of all the states for the year, with
        columns 'State', 'City', 'NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'Total Mean'.
    """
    per_state_frames = [
        get_city_df(state_name, df[df['State'] == state_name].loc[year])
        for state_name in states
    ]
    stacked = pd.concat(per_state_frames)
    # put the identifying columns first, followed by the gas means
    return stacked.reindex(
        columns=['State', 'City', 'NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'Total Mean']
    )
# Cache of geocoding results: {city: {loc_type: coordinate or 'NF'}}.
city_pos = {}


def get_city_loc(loc_type, city):
    """
    Returns the longitude or latitude of a city, geocoding it at most once.

    The geocoder is queried a single time per city and both coordinates are
    stored in city_pos, so asking for the second coordinate of the same city
    does not trigger another lookup.

    Args:
        loc_type (str): "lon" for the longitude; any other value yields the latitude.
        city (str): name of the city, passed to the geocoder as the query.

    Returns:
        The requested coordinate, or the string 'NF' when the geocoder finds nothing.
    """
    coords = city_pos.setdefault(city, {})
    if loc_type not in coords:
        location = geolocator.geocode(city)
        if location is None:
            longitude = latitude = 'NF'
        else:
            longitude = location.longitude
            latitude = location.latitude
        # remember both coordinates from this single lookup
        coords.setdefault('lon', longitude)
        coords.setdefault('lat', latitude)
        coords[loc_type] = longitude if loc_type == "lon" else latitude
    return coords[loc_type]
def get_lat_long(df):
    """
    Adds 'Longitude' and 'Latitude' columns to the dataframe (df), looked up
    from its 'City' column via get_city_loc.

    The dataframe is modified in place and also returned.
    """
    # longitude first, then latitude
    for column, kind in (('Longitude', 'lon'), ('Latitude', 'lat')):
        df[column] = df['City'].apply(
            lambda city_name, loc_kind=kind: get_city_loc(loc_kind, city_name)
        )
    return df
def show_data_year(gas_mean, year):
    """
    Draws an interactive map plot of one gas for one year, with one marker per city.

    The figure is written to '<gas>_<year>.html' (where <gas> is the part of
    gas_mean before its first space) and is also shown in the notebook.

    Args:
        gas_mean (str): any of the values from NO2 Mean, O3 Mean, SO2 Mean, CO Mean or Total Mean.
        year (str): any value from 2000 to 2016 inclusive.
    """
    cities = get_lat_long(year_df(year))
    cities['text'] = cities['State'] + ' - ' + cities['City']

    # marker size/colour: each city's share of the summed gas level, times 800
    sizes = cities[gas_mean] / sum(cities[gas_mean]) * 800

    # text before the first space, e.g. "NO2" for "NO2 Mean"
    gas_label = gas_mean[:gas_mean.find(" ")]

    marker_style = dict(
        size=sizes,
        opacity=1,
        reversescale=True,
        autocolorscale=False,
        colorscale='Agsunset',
        cmin=0,
        color=sizes,
        cmax=sizes.max(),
        colorbar_title="Gas Levels <br> (Scaled)",
    )
    city_markers = go.Scattergeo(
        lon=cities['Longitude'],
        lat=cities['Latitude'],
        locationmode='USA-states',
        text=cities['text'],
        mode='markers',
        marker=marker_style,
    )

    fig = go.Figure(data=city_markers)
    fig.update_layout(title=f'{gas_label} levels for {year}', geo_scope='usa')

    fig.write_html(f'{gas_label}_{year}.html', default_width="60%")
    fig.show()
show_data_year("Total Mean", "2016")
It seems that Phoenix, Arizona has the maximum total gas emissions in 2016, followed by Winter Park, Florida.
show_data_year("Total Mean", "2000")
Interestingly, from the previous two graphs, one sees that Phoenix, Arizona was not leading in total gas emissions in 2000, and the same goes for Winter Park, Florida. But over the course of 17 years, these two cities have risen to the top.
show_data_year("NO2 Mean", "2016")
In the year 2016, Denver, Colorado is leading with the maximum nitrogen dioxide emissions, followed by Phoenix, Arizona.
show_data_year("NO2 Mean", "2000")
Again, it is quite astonishing to see that Denver, Colorado had one of the lowest nitrogen dioxide emissions in 2000, but rose to the top in the span of 17 years. Phoenix, Arizona, on the other hand, was among the top in 2000 (with Los Angeles, California and Burbank, California) and is still among the top in 2016.
One can run the above function to produce visualizations for each year. The visualizations are quite fun to play with and can help draw easy inferences. We have presented a couple of inferences in this notebook by just comparing the values in 2000 and 2016 for total gas emission levels and nitrogen dioxide emission levels. We encourage the user to make more inferences for the remaining gases as well as for other years.